In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from IPython.display import display
import plotly.express as px
import plotly.graph_objects as go
In [2]:
%%time
df = dd.read_parquet(r"C:\Users\prath\OneDrive\Desktop\microsoft_malware_prediction\Data Exploration\train.parquet")
df = df.compute()
CPU times: total: 23.4 s
Wall time: 1min 20s
In [3]:
print(df.shape)
(8921483, 83)
In [4]:
%%time
stats = []
for col in df.columns:
    stats.append((col, df[col].nunique(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))

stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)
CPU times: total: 12.5 s
Wall time: 15.9 s
Out[4]:
Feature Unique_values Percentage of missing values Percentage of values in the biggest category type
28 PuaMode 2 99.974119 99.974119 category
41 Census_ProcessorClass 3 99.589407 99.589407 category
8 DefaultBrowsersIdentifier 2017 95.141637 95.141637 float32
68 Census_IsFlightingInternal 2 83.044030 83.044030 float32
52 Census_InternalBatteryType 78 71.046809 71.046809 category
... ... ... ... ... ...
1 ProductName 6 0.000000 98.935569 category
45 Census_HasOpticalDiskDrive 2 0.000000 92.281272 int16
54 Census_OSVersion 469 0.000000 15.845202 category
55 Census_OSArchitecture 3 0.000000 90.858045 category
82 HasDetections 2 0.000000 50.020731 int16

83 rows × 5 columns

In [5]:
train= pd.DataFrame(df)
good_cols = list(df.columns)
for col in df.columns:
    rate = df[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.9: # Removing Missing Values if 
        good_cols.remove(col)
In [6]:
train= df[good_cols]
In [7]:
def plot_category_percent_of_target(col,color):
    cat_percent = train[[col, 'HasDetections']].groupby(col, as_index=False).mean()
    cat_size = train[col].value_counts().reset_index(drop=False)
    cat_size.columns = [col, 'count']
    cat_percent = cat_percent.merge(cat_size, on=col, how='left')
    cat_percent['HasDetections'] = cat_percent['HasDetections'].fillna(0)
    cat_percent = cat_percent.sort_values(by='count', ascending=False)[:20]

    fig = px.bar(cat_percent, x='HasDetections', y=col, orientation='h', 
                 hover_data={'HasDetections': ':.2f', 'count': True}, 
                 color='HasDetections', color_continuous_scale=color)
    fig.update_layout(title='Percentage of HasDetections vs {}'.format(col),
                      xaxis_title='% of HasDetections (target)', yaxis_title=col)
    fig.show()
In [8]:
plot_category_percent_of_target("EngineVersion",color="teal")
In [9]:
plot_category_percent_of_target("AppVersion",color="Purples")
In [10]:
plot_category_percent_of_target("OsPlatformSubRelease",color="ice")
In [11]:
plot_category_percent_of_target("OsBuildLab",color="speed")
In [12]:
plot_category_percent_of_target("OsBuildLab",color="Magenta")
In [13]:
plot_category_percent_of_target("SkuEdition",color="balance")
In [14]:
plot_category_percent_of_target("SmartScreen",color="Portland")
In [15]:
plot_category_percent_of_target("Census_MDC2FormFactor",color="Redor")
In [16]:
plot_category_percent_of_target("Census_PrimaryDiskTypeName",color="Agsunset")
In [17]:
plot_category_percent_of_target("Census_ChassisTypeName",color="Deep")
In [18]:
plot_category_percent_of_target("Census_PowerPlatformRoleName",color="RdPu")
In [19]:
plot_category_percent_of_target("Census_OSVersion",color="Sunset")
In [20]:
plot_category_percent_of_target("Census_OSBranch",color="Oryel")
In [21]:
plot_category_percent_of_target("Census_OSEdition",color="Haline")
In [22]:
plot_category_percent_of_target("Census_OSSkuName",color="DarkMint")
In [23]:
plot_category_percent_of_target("Census_OSInstallTypeName",color="Reds")
In [24]:
plot_category_percent_of_target("Census_OSWUAutoUpdateOptionsName",color="Ice")
In [25]:
plot_category_percent_of_target("Census_ActivationChannel",color="Mint")